Introduction

In previous documents, I have:

  • defined LADs that change during the cell cycle
  • tried to link these to other data sets

This was done very inefficiently, unorganized and for the (three) cell types separately. I want to make this more generalized, structured and plot the resulting output in a more summarized fashion.

Method

Overall, this document will replace the documents linking the dynamic LADs to other data sets. The method will be the same, although I would like to design a different output figure to summarize all data much conveniently. The strategy is summarized below.

Analysis procedure

Various plots.

Set-up

Initially, I want to prepare the input and output. Load in the libraries and set other constants.

Set-up knitr

I always appreciate having the markdown figures as png / pdf, as plotted in the document. This is set below.

List data sets to compare with

Below, I will provide the data sets that will be processed for this document. These consist of (when data is available) :

  • DamID
  • Replication timing
  • Gene expression
  • Histone modifications (repressive only)
# Input directories
dir.damid.base <- "/DATA/usr/t.v.schaik/proj/3D_nucleus/results/ts180110_4DN_DataProcessing/results/normalized/bin-20kb/"
dir.padamid.base <- "/DATA/usr/t.v.schaik/proj/tests/results/ts180813_GCF5083_pADamIDtests/results/normalized/bin-20kb/"
dir.repliseq.base <- "/DATA/usr/t.v.schaik/proj/tests/results/ts190301_pADamID_CellCycle/ts190731_4DN_RepliSeq/bigwigs/"
dir.chip.base <- "/DATA/usr/t.v.schaik/proj/tests/results/ts180813_GCF5083_pADamIDtests/ts190320_pADamID_correlations/histone_correlations/"

data.tracks <- list(
  
  # Hap1
  Hap1 = list(damid_lmnb1 = file.path(dir.damid.base,
                                "Hap1_LMNB1-20kb-combined.norm.txt.gz"),
              padamid_h3k27me3 = file.path(dir.padamid.base,
                                           "Hap1_H3K27me3-20kb-combined.norm.txt.gz"),
              padamid_h3k9me3 = file.path(dir.padamid.base,
                                          "Hap1_H3K9me3-20kb-combined.norm.txt.gz")),

  # HCT116
  HCT116 = list(damid_lmnb1 = file.path(dir.damid.base,
                                        "HCT116_LMNB1-20kb-combined.norm.txt.gz"),
                repliseq = file.path(dir.repliseq.base, 
                                     "HCT116_r1_20kb.bw"),
                chip_h3k27me3 = file.path(dir.chip.base,
                                          "ChIPseq-HCT116_H3K27me3-20kb.bw"),
                chip_h3k9me3 = file.path(dir.chip.base,
                                         "ChIPseq-HCT116_H3K9me3-20kb.bw")),
  
  # K562
  K562 = list(damid_lmnb1 = file.path(dir.damid.base,
                                      "K562_LMNB1-20kb-combined.norm.txt.gz"),
              repliseq = file.path(dir.repliseq.base, 
                                   "K562_r1_20kb.bw"),
              chip_h3k27me3 = file.path(dir.chip.base,
                                        "ChIPseq-K562_H3K27me3-20kb.bw"),
              chip_h3k9me3 = file.path(dir.chip.base,
                                       "ChIPseq-K562_H3K9me3-20kb.bw"),
              padamid_h3k27me3 = file.path(dir.padamid.base,
                                           "K562_H3K27me3-20kb-combined.norm.txt.gz"),
              padamid_h3k9me3 = file.path(dir.padamid.base,
                                          "pADamID-K562_r3_H3K9me3-20kb.norm.txt.gz"))
  
)

# Confirm that all the files exist
all(file.exists(unlist(data.tracks)))
## [1] TRUE

1) Load input

First, I will load the required input and prepare the data structure required.

4) Gene expression

How do the dynamic LADs correlate with gene expression?

df.combined <- c()

for (cell in cells) {
  
  print(cell)
  
  # LAD status to genes
  LADs <- LADs.norm[[cell]]
  ovl <- findOverlaps(rnaseq, LADs, type = "within")
  
  # Convert to df
  df <- as(mcols(rnaseq), "data.frame")[, c("gene_id", "gene_name", cell)]
  names(df)[3] <- "expr"
  
  df$result <- factor("iLAD", levels = c(levels(LADs$G2_G1), "iLAD"))
  df$result[queryHits(ovl)] <- LADs$G2_G1[subjectHits(ovl)]
  
  # Complete cases
  df <- df[complete.cases(df), ]
  
  # Plot beeswarm
  plt <- ggplot(df, aes(x = result, y = expr, col = result, group = result)) +
    geom_quasirandom() +
    geom_boxplot(outlier.shape = NA, fill = NA, col = "black", width = 0.3) +
    ggtitle("Gene change versus gene expression") +
    xlab("pA-DamID change") +
    ylab("Gene expression (rlog2)") +
    scale_color_brewer(palette = "Set1") +
    theme_bw() +
    theme(aspect.ratio = 1)
  
  # plot(plt)
  
  
  # What are active genes?
  plt <- ggplot(df, aes(x = expr)) +
    geom_histogram(binwidth = 1) +
    geom_vline(xintercept = 7, col = "red", linetype = "dashed") +
    xlab("Gene expression (rlog)") +
    ylab("Count") +
    ggtitle("Gene expression histogram") +
    theme_bw() +
    theme(aspect.ratio = 1)
  
  # plot(plt)
  
  
  # Let's say rlog > 7
  cutoff <- 7
  
  # Gene density
  LADs$gene_density <- countOverlaps(LADs, rnaseq, type = "any") / width(LADs) * 1e6
  LADs$gene_density.active <- countOverlaps(LADs, rnaseq[mcols(rnaseq)[, cell] > cutoff], 
                                            type = "any") / width(LADs) * 1e6
  
  # Get the data frame
  df <- as(mcols(LADs), "data.frame")
  # df <- df[complete.cases(df), ]
  
  # Plot beeswarm
  plt <- ggplot(df, aes(x = G2_G1, y = gene_density, fill = G2_G1, group = G2_G1)) +
    geom_boxplot(outlier.shape = NA) +
    ggtitle("Gene change versus gene density") +
    xlab("pA-DamID change") +
    ylab("# genes / Mb") +
    scale_color_brewer(palette = "Set1", name = "result") +
    coord_cartesian(ylim = c(0, 18)) +
    theme_classic() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
  
  # plot(plt)
  
  plt <- ggplot(df, aes(x = G2_G1, y = gene_density.active, fill = G2_G1, group = G2_G1)) +
    geom_boxplot(outlier.shape = NA) +
    ggtitle("Gene change versus gene density") +
    xlab("pA-DamID change") +
    ylab("# active genes / Mb") +
    scale_color_brewer(palette = "Set1", name = "result") +
    coord_cartesian(ylim = c(0, 6)) +
    theme_classic() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
  
  # plot(plt)
  
  df$cell <- cell
  df.combined <- rbind(df.combined, 
                       df[, c("gene_density.active", "cell", "G2_G1")])
  
}
## [1] "Hap1"
## [1] "HCT116"
## [1] "K562"

Variable.

5) Replication timing

How are the replication timing scores for these changing domains?

## [1] "HCT116"
## [1] "K562"

Variable.

7) Chromosomal positioning

Distance to telomeres / centromeres.

# Read chrom sizes
chrom_sizes <- read.table("/DATA/usr/t.v.schaik/data/genomes/hg19/hg19_chrom_sizes.txt", sep = "\t")
row.names(chrom_sizes) <- chrom_sizes[, 1]

# Function to scale chromosome arms
ScaleChromosomeArms <- function(gr, chrom_sizes, centromeres.gr, inverse = FALSE) {
  # For a given "gr" GRanges object, calculate the relative distance to a
  # chromosome arm. 
  
  # Calculate distance to centromere for every gene
  gr$distance_to_centromere <- mcols(distanceToNearest(gr, centromeres.gr))$distance
  gr$distance_to_centromere <- as.numeric(gr$distance_to_centromere)
  
  # For each chromosome
  #   if start of gene is left of the centromere,
  #     normalize the distance by the first base - start centromere distance
  #   else
  #     normalize by the distance by the end centromere - last base
  for (chr in seqlevels(gr)) {
    centromeres.chr <- centromeres.gr[seqnames(centromeres.gr) == chr]
    left_arm <- start(centromeres.chr) - 1
    right_arm <- chrom_sizes[chr, 2] - end(centromeres.chr)
    
    idx <- seqnames(gr) %in% chr
    gr.tmp <- gr[idx]
    
    norm_dis <- ifelse(start(gr.tmp) < start(centromeres.chr),
                       gr.tmp$distance_to_centromere / left_arm,
                       gr.tmp$distance_to_centromere / right_arm)
    
    # If inverse, inverse the distance (meaning towards telomeres)
    if (inverse) {
      norm_dis <- 1 - norm_dis
    }
    
    mcols(gr[idx])[, "distance_to_centromere"] <- norm_dis
  }
  
  # Return normalized distances
  gr$distance_to_centromere
}

# Define telomeres & load centromeres
telomeres <- GRanges(seqnames = rep(seqlevels(LADs), times = 2),
                     ranges = IRanges(start = c(rep(1, length(seqlevels(LADs))),
                                                seqlengths(LADs)),
                                      end = c(rep(1, length(seqlevels(LADs))),
                                              seqlengths(LADs))))
centromeres <- import("ts190708_differential_analysis_BinsandLADs_K562/centromeres.bed")

# Prep output df
df.combined <- c()

for (cell in cells) {
  
  LADs <- LADs.norm[[cell]]
  
  # Distance to telomeres data frame
  LADs$distance_to_telomeres <- mcols(distanceToNearest(LADs, telomeres))$distance / 1e6
  LADs$distance_to_centromeres <- mcols(distanceToNearest(LADs, centromeres))$distance / 1e6
  LADs$distance_to_telomeres_scaled <- ScaleChromosomeArms(LADs, chrom_sizes, 
                                                             centromeres, inverse = T)
  
  LADs$seqnames <- seqnames(LADs)
  
  # Add mean G1 / G2
  mcols(LADs)[, "G1"] <- rowMeans(as(mcols(LADs), "data.frame")[, samples.df[[cell]]$name[samples.df[[cell]]$phase == "G1"]])
  mcols(LADs)[, "G2"] <- rowMeans(as(mcols(LADs), "data.frame")[, samples.df[[cell]]$name[samples.df[[cell]]$phase == "G2"]])
  mcols(LADs)[, "diff"] <- LADs$G2 - LADs$G1
  
  # Add LAD size
  mcols(LADs)[, "size"] <- width(LADs) / 1e6
  
  df <- as(mcols(LADs), "data.frame")
  
  df$cell <- cell
  df.combined <- rbind(df.combined, 
                       df[, c("distance_to_telomeres", "distance_to_centromeres", 
                              "distance_to_telomeres_scaled", "G1", "G2", "diff",
                              "size", "cell", "G2_G1", "seqnames")])
  
  
}

df.combined$cell <- factor(df.combined$cell, levels = c("Hap1", "HCT116", "K562"))

# Order chromosomes by length
df.combined$seqnames <- factor(df.combined$seqnames,
                               levels = seqlevels(LADs)[order(seqlengths(LADs), 
                                                              decreasing = T)])

# Plots
# 1) Telomere distance
plt <- ggplot(df.combined, aes(x = cell, y = distance_to_telomeres, 
                               fill = G2_G1)) +
  geom_boxplot(outlier.shape = NA, position = "dodge") +
  ggtitle("Telomeres") +
  xlab("") +
  ylab("Distance to telomeres (Mb)") +
  scale_x_discrete(drop = FALSE) +
  scale_fill_manual(values = brewer.pal(9, "Set1")[c(9, 1, 3)],
                    name = "result") +
  #coord_cartesian(ylim = c(0, 8)) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

plot(plt)

Conclusion

Important message: there is no correlation between DNA characteristics and lamina cell cycle dynamics. The main feature really is telomere positioning.

SessionInfo

## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.6 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/libblas/libblas.so.3.6.0
## LAPACK: /usr/lib/lapack/liblapack.so.3.6.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] knitr_1.22           gtools_3.8.1         edgeR_3.26.0        
##  [4] limma_3.40.0         ggbeeswarm_0.6.0     RColorBrewer_1.1-2  
##  [7] reshape2_1.4.3       ggplot2_3.1.1        rtracklayer_1.44.0  
## [10] GenomicRanges_1.36.0 GenomeInfoDb_1.20.0  IRanges_2.18.0      
## [13] S4Vectors_0.22.0     BiocGenerics_0.30.0 
## 
## loaded via a namespace (and not attached):
##  [1] Biobase_2.44.0              splines_3.6.1              
##  [3] Formula_1.2-3               assertthat_0.2.1           
##  [5] latticeExtra_0.6-28         GenomeInfoDbData_1.2.1     
##  [7] vipor_0.4.5                 Rsamtools_2.0.0            
##  [9] yaml_2.2.0                  pillar_1.3.1               
## [11] backports_1.1.4             lattice_0.20-38            
## [13] glue_1.3.1                  digest_0.6.18              
## [15] XVector_0.24.0              checkmate_1.9.3            
## [17] colorspace_1.4-1            htmltools_0.3.6            
## [19] Matrix_1.2-17               plyr_1.8.4                 
## [21] XML_3.98-1.19               pkgconfig_2.0.2            
## [23] zlibbioc_1.30.0             purrr_0.3.2                
## [25] scales_1.0.0                BiocParallel_1.18.0        
## [27] tibble_2.1.1                htmlTable_1.13.1           
## [29] withr_2.1.2                 SummarizedExperiment_1.14.0
## [31] nnet_7.3-12                 lazyeval_0.2.2             
## [33] survival_2.44-1.1           magrittr_1.5               
## [35] crayon_1.3.4                evaluate_0.13              
## [37] foreign_0.8-71              beeswarm_0.2.3             
## [39] data.table_1.12.2           tools_3.6.1                
## [41] matrixStats_0.54.0          stringr_1.4.0              
## [43] munsell_0.5.0               locfit_1.5-9.1             
## [45] cluster_2.0.9               DelayedArray_0.10.0        
## [47] Biostrings_2.52.0           compiler_3.6.1             
## [49] rlang_0.3.4                 grid_3.6.1                 
## [51] RCurl_1.95-4.12             rstudioapi_0.10            
## [53] htmlwidgets_1.3             labeling_0.3               
## [55] bitops_1.0-6                base64enc_0.1-3            
## [57] rmarkdown_1.12              gtable_0.3.0               
## [59] R6_2.4.0                    GenomicAlignments_1.20.0   
## [61] gridExtra_2.3               dplyr_0.8.0.1              
## [63] Hmisc_4.2-0                 stringi_1.4.3              
## [65] Rcpp_1.0.1                  rpart_4.1-15               
## [67] acepack_1.4.1               tidyselect_0.2.5           
## [69] xfun_0.6